Clasificador - hito 2

In [1]:
import pandas as pd
import numpy as np
import pickle
from string import punctuation
 
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="whitegrid")
plt.rc('axes', titlesize=14)
plt.rc('legend', fontsize=14)
plt.rc('xtick', labelsize=12)
plt.rc('ytick', labelsize=12)
plt.rcParams.update({'font.size': 16})
plt.rcParams['axes.titlesize'] = 16
plt.rcParams["figure.figsize"] = (10, 6)
plt.rcParams.update({'lines.markeredgewidth': 1})
plt.rcParams.update({'errorbar.capsize': 2})
import plotly
import plotly.express as px

file_names = {
    "df_es_mapping": "../../Data/mapping/df_es_mapping.pickle",
    "df_us_mapping": "../../Data/mapping/df_us_mapping.pickle",
    
    "df_es_test": "../../Data/test/df_es_test.pickle",
    "df_us_test": "../../Data/test/df_us_test.pickle",
    
    "df_es_train": "../../Data/train/df_es_train.pickle",
    "df_us_train": "../../Data/train/df_us_train.pickle",
    
    "df_es_trial": "../../Data/trial/df_es_trial.pickle",
    "df_us_trial": "../../Data/trial/df_us_trial.pickle",
}

# mas imports

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import TweetTokenizer # tokenizer especial para tweets
tt = TweetTokenizer()
# nltk.download('stopwords')
from nltk.corpus import stopwords
from string import punctuation

import plotly as ply
import plotly.graph_objects as go
import numpy as np

cargar sets

In [2]:
df_es_train = pickle.load(open(file_names["df_es_train"], "rb"))
df_es_trial = pickle.load(open(file_names["df_es_trial"], "rb"))
df_es_test = pickle.load(open(file_names["df_es_test"], "rb"))

df_us_train = pickle.load(open(file_names["df_us_train"], "rb"))
df_us_trial = pickle.load(open(file_names["df_us_trial"], "rb"))
df_us_test = pickle.load(open(file_names["df_us_test"], "rb"))

pre-procesamiento

In [3]:
df_us_train['tokenized_text'] = df_us_train['text'].str.lower().apply(lambda x: " ".join(tt.tokenize(x)))
df_us_train.head()
Out[3]:
id text label tokenized_text
0 729044324441186304 Selfies for summatime @ Drexel University 12 selfies for summatime @ drexel university
1 663834134037442560 Ready to be a bulldog with rasso #hailstate #i... 14 ready to be a bulldog with rasso #hailstate #i...
2 747449193350963200 #scored my new #matcotools #slidehammer weight... 16 #scored my new #matcotools #slidehammer weight...
3 691439672761925637 @user last night was so much fun @ Skyway Thea... 6 @user last night was so much fun @ skyway theatre
4 758118895618109440 love beach days @ Manasquan Beach 12 love beach days @ manasquan beach
In [4]:
df_us_test['tokenized_text'] = df_us_test['text'].str.lower().apply(lambda x: " ".join(tt.tokenize(x)))
In [5]:
"""stopwords_en_withpunct = set(stopwords_en).union(set(punctuation))
print(list(stopwords_en_withpunct)[:10])""";
In [6]:
from sklearn.feature_extraction.text import CountVectorizer
In [7]:
vectorizer = CountVectorizer(min_df=5)
X_train_bow = vectorizer.fit_transform(df_us_train["tokenized_text"])
X_test_bow = vectorizer.transform(df_us_test["tokenized_text"])
In [8]:
from sklearn.naive_bayes import MultinomialNB
In [9]:
clf = MultinomialNB()
clf.fit(X_train_bow, df_us_train["label"])
Out[9]:
MultinomialNB()
In [10]:
clf.score(X_train_bow, df_us_train["label"])
Out[10]:
0.39701310639001064
In [11]:
from sklearn.metrics import classification_report
In [12]:
df_us_mapping = pickle.load(open(file_names["df_us_mapping"], "rb")).sort_values("label")
df_us_mapping
Out[12]:
label emoji name
0 0 ā¤ _red_heart_
1 1 šŸ˜ _smiling_face_with_hearteyes_
10 10 šŸ“· _camera_
11 11 šŸ‡ŗšŸ‡ø _United_States_
12 12 ā˜€ _sun_
13 13 šŸ’œ _purple_heart_
14 14 šŸ˜‰ _winking_face_
15 15 šŸ’Æ _hundred_points_
16 16 😁 _beaming_face_with_smiling_eyes_
17 17 šŸŽ„ _Christmas_tree_
18 18 šŸ“ø _camera_with_flash_
19 19 😜 _winking_face_with_tongue_
2 2 šŸ˜‚ _face_with_tears_of_joy_
3 3 šŸ’• _two_hearts_
4 4 šŸ”„ _fire_
5 5 😊 _smiling_face_with_smiling_eyes_
6 6 šŸ˜Ž _smiling_face_with_sunglasses_
7 7 ✨ _sparkles_
8 8 šŸ’™ _blue_heart_
9 9 😘 _face_blowing_a_kiss_
In [13]:
y_pred = clf.predict(X_test_bow)
print(classification_report(df_us_test["label"], y_pred, target_names=df_us_mapping["emoji"]))
              precision    recall  f1-score   support

           ā¤       0.35      0.58      0.44     10798
           šŸ˜       0.25      0.25      0.25      4830
           šŸ“·       0.16      0.16      0.16      1432
          šŸ‡ŗšŸ‡ø       0.47      0.50      0.48      1949
           ā˜€       0.25      0.43      0.32      1265
           šŸ’œ       0.32      0.05      0.08      1114
           šŸ˜‰       0.12      0.04      0.06      1306
           šŸ’Æ       0.27      0.14      0.19      1244
           😁       0.14      0.03      0.05      1153
           šŸŽ„       0.60      0.60      0.60      1545
           šŸ“ø       0.29      0.10      0.15      2417
           😜       0.04      0.01      0.01      1010
           šŸ˜‚       0.30      0.52      0.38      4534
           šŸ’•       0.19      0.05      0.08      2605
           šŸ”„       0.45      0.47      0.46      3716
           😊       0.09      0.06      0.07      1613
           šŸ˜Ž       0.16      0.11      0.13      1996
           ✨       0.29      0.18      0.22      2749
           šŸ’™       0.22      0.07      0.10      1549
           😘       0.16      0.05      0.08      1175

    accuracy                           0.32     50000
   macro avg       0.26      0.22      0.22     50000
weighted avg       0.29      0.32      0.28     50000

In [14]:
vocab = {k: v for v, k in enumerate(vectorizer.get_feature_names_out())}
In [15]:
vec_test = np.zeros(X_train_bow.shape[1])
k = vocab["santa"]
vec_test[k] = 1
print(vectorizer.inverse_transform([vec_test])[0][0])
clf.predict_proba([vec_test])
santa
Out[15]:
array([[0.21267139, 0.10559105, 0.03170409, 0.02263593, 0.06533366,
        0.01011697, 0.02992065, 0.00932074, 0.02055197, 0.11856118,
        0.02276163, 0.02159743, 0.10988494, 0.03433423, 0.01711626,
        0.03922043, 0.05487754, 0.02576475, 0.02707579, 0.02095938]])

Top palabras por emoji

In [16]:
%%time
vocab_length = X_train_bow.shape[1]
proba_matrix = np.array([clf.predict_proba(np.eye(1,vocab_length,k))[0] for k in range(vocab_length)])
Wall time: 13.2 s
In [17]:
print(vocab_length)
print(proba_matrix.shape)
29983
(29983, 20)
In [18]:
una_linea = proba_matrix[:,3]
una_linea.shape
Out[18]:
(29983,)
In [19]:
def topPalabras(proba_matrix,emoji_id,k=5):
    # retorna las palabras para las cuales el emoji en cuestión tiene mas probabilidad
    prob = proba_matrix[:,emoji_id]  # mmm
    ind = np.argpartition(prob,-k)[-k:]
    val = prob[ind]
    palabras = [vectorizer.inverse_transform([np.eye(1,vocab_length,k)[0]])[0][0] for k in ind]
    return palabras, val
In [20]:
i = 9
map_emojis = df_us_mapping["label"].values
print(df_us_mapping["emoji"][int(map_emojis[i])])
topPalabras(proba_matrix,i)
šŸŽ„
Out[20]:
(['tree', 'tis', 'christmas2015', 'merry', 'christmastree'],
 array([0.58739394, 0.62045562, 0.59251379, 0.59215716, 0.78961278]))
In [21]:
for i in range(20):
    print(df_us_mapping["emoji"][int(map_emojis[i])])
    pal, val = topPalabras(proba_matrix,i)
    print(dict([(pal[j],val[j]) for j in range(len(pal))]))
ā¤
{'heart': 0.5761150508287713, 'valentines': 0.5811796285055364, 'lovemyfamily': 0.5793072574283205, 'valentine': 0.5949253260518093, 'loveofmylife': 0.6651429760457039}
šŸ˜
{'inlove': 0.4288496011644503, 'gorg': 0.4453369489332203, 'obsessed': 0.44883209548006375, 'swoon': 0.45600734120528374, 'swooning': 0.4562752924146042}
šŸ“·
{'sony': 0.3882939691961041, 'gdlfashion': 0.4052054276523981, 'bvillain': 0.5117380913776236, 'shredforaliving': 0.42318456197855087, 'kae': 0.4156412082407767}
šŸ‡ŗšŸ‡ø
{'murica': 0.7800742137338681, 'imwithher': 0.8062308003990193, 'election2016': 0.8313563588022287, 'ivoted': 0.8638840691974496, 'merica': 0.8821097001791353}
ā˜€
{'soakin': 0.447527739339606, 'sun': 0.45989390878761877, 'sunny': 0.4740267973356278, 'sunshine': 0.5826999203405687, 'beachin': 0.4927352286841576}
šŸ’œ
{'snyder': 0.3559341490500346, 'ripprince': 0.3575182172656461, 'endalz': 0.4398786494003444, 'purple': 0.5506017875276537, 'purplerain': 0.5134869871082338}
šŸ˜‰
{'backtowork': 0.2129172475035178, 'wink': 0.23697450673018283, 'azek': 0.23901373031599815, 'silvercriketgentlemensclub': 0.23901373031599815, 'mividaesunatombola': 0.3304591357770921}
šŸ’Æ
{'t3t': 0.39850650010969246, 'keepin': 0.4169922638434261, 'facts': 0.5848465289708452, 'realtalk': 0.5061620053576262, 'rns': 0.47800047055538203}
😁
{'dentist': 0.239497184608953, 'dentistry': 0.2741579803483271, 'cheesin': 0.28598242907316934, 'braces': 0.2908595553667064, 'djsty': 0.29970001389656153}
šŸŽ„
{'tree': 0.5873939423592781, 'tis': 0.6204556185878153, 'christmas2015': 0.5925137901344447, 'merry': 0.5921571580626953, 'christmastree': 0.7896127824106279}
šŸ“ø
{'cred': 0.3159050632200524, 'headshot': 0.3241125570492244, 'mag': 0.3337009742052551, 'opus': 0.4241583818170979, 'bricks': 0.34874909576037366}
😜
{'burpees': 0.16219480409077416, 'jewelrydesigner': 0.17063599291164733, 'wacky': 0.18374154943712573, 'silly': 0.19345012616245877, 'cray': 0.22352568350963714}
šŸ˜‚
{'funny': 0.709298339443829, 'wtf': 0.7376276034392985, 'lmfao': 0.8387500617307875, 'lmao': 0.8600667188416347, 'hilarious': 0.7794513567711954}
šŸ’•
{'endorsement': 0.32348853465380645, 'lovealwaysyje': 0.33034960065978825, 'pink': 0.3544185518163782, 'strides': 0.3722537471026334, 'breast': 0.44012548234851656}
šŸ”„
{'flame': 0.6954472080681913, 'flames': 0.7013605810434306, 'fire': 0.702586754042741, 'mixtape': 0.7155634858236927, 'lit': 0.7075956742171502}
😊
{'worlds2016': 0.27632029133395997, '7171': 0.2814042655159173, 'bagsbycab': 0.28184206439355336, '3037': 0.29394808945050926, '802': 0.2838469184935355}
šŸ˜Ž
{'beautique': 0.3566104881826646, 'shades': 0.4286315258210222, 'sunglasses': 0.5538250850510409, 'coolin': 0.3872564798469744, 'eyewear': 0.3732911260893004}
✨
{'getonshimmur': 0.38076409557547336, 'sparkle': 0.55606315222639, 'glitter': 0.3824542997618393, 'magical': 0.4041663529709088, 'pixie': 0.41945855964929035}
šŸ’™
{'rupp': 0.39496130878522095, 'foreverroyal': 0.43868663666132685, 'royals': 0.43887929760921546, 'autism': 0.4912285850528072, 'bbn': 0.44463409096380896}
😘
{'kissy': 0.2835098942218612, 'kiss': 0.2998014122533109, 'kisses': 0.3778843629927282, 'smooches': 0.3331135467798145, 'princessmailyana': 0.28949959973025136}

Visualización de tokens según Naive Bayes

Esta seccion consiste en una visualizacion de los tokens segun la codificacion que nos entrega Naive Bayes. De la seccion anterior, se pueden obtener las probabilidad de que un token pertenezca a una clase dada. En nuestro caso, a un emoji dado. Esto es:

$$P(w \in C) = \frac{\text{#(tweets donde $w$ es uno de sus tokens y el tweet tiene el emoji $C$)}}{\text{#(tweets con el token $w$)}}$$

De esta manera, cada token posee un vector de probabilidades. Donde la $C-$esima componente corresponde a $P(w \in C)$. Es decir,

$$\vec{w} = (P(w \in C) : \text{$C$ es un emoji})$$

En particular, cada vector $\vec{w}$ es uno con tantas coordenadas como emojis (20 en Ingles). Y cada coordenada esta entre 0 y 1. Es decir, cada $\vec{w} \in [0, 1]^{\text{#Emojis}}$.

Ahora bien, es de nuestro interes visualizar cada token segun su vector de probabilidad. Sin embargo, es necesario reducir la dimensionalidad de cada vector a una facil de interpretar (en nuestro caso 2-dimensiones). Para esto, se utiliza un metodo de reduccion de dimensionalidad denominado UMAP y ampliamente utilizado para la visualizacion de datos en altas dimensiones.

In [22]:
!pip install umap-learn
import umap
Requirement already satisfied: umap-learn in c:\users\felip\anaconda3\lib\site-packages (0.5.2)
Requirement already satisfied: tqdm in c:\users\felip\anaconda3\lib\site-packages (from umap-learn) (4.63.0)
Requirement already satisfied: pynndescent>=0.5 in c:\users\felip\anaconda3\lib\site-packages (from umap-learn) (0.5.4)
Requirement already satisfied: scipy>=1.0 in c:\users\felip\anaconda3\lib\site-packages (from umap-learn) (1.6.2)
Requirement already satisfied: scikit-learn>=0.22 in c:\users\felip\anaconda3\lib\site-packages (from umap-learn) (1.0.2)
Requirement already satisfied: numba>=0.49 in c:\users\felip\anaconda3\lib\site-packages (from umap-learn) (0.54.1)
Requirement already satisfied: numpy>=1.17 in c:\users\felip\anaconda3\lib\site-packages (from umap-learn) (1.20.3)
Requirement already satisfied: llvmlite<0.38,>=0.37.0rc1 in c:\users\felip\anaconda3\lib\site-packages (from numba>=0.49->umap-learn) (0.37.0)
Requirement already satisfied: setuptools in c:\users\felip\anaconda3\lib\site-packages (from numba>=0.49->umap-learn) (61.2.0)
Requirement already satisfied: joblib>=0.11 in c:\users\felip\anaconda3\lib\site-packages (from pynndescent>=0.5->umap-learn) (1.1.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\felip\anaconda3\lib\site-packages (from scikit-learn>=0.22->umap-learn) (2.2.0)
Requirement already satisfied: colorama in c:\users\felip\appdata\roaming\python\python38\site-packages (from tqdm->umap-learn) (0.4.3)
In [23]:
reducer = umap.UMAP()
to_R2 = reducer.fit_transform(proba_matrix)
to_R2.shape
Out[23]:
(29983, 2)

Luego de reducir los vectores de probabilidad a uno de bi-dimensional, visualizaremos segun dos aspectos el espacio de tokens. Primero, se colorean los vectores segun el emoji con mayor probabilidad. Por ejemplo, si el token $happy$ tiene mayor probabilidad de estar en la clase $smile$, entonces se asocia este token con dicho emoji. La razon de esto es solo para simplificar el analisis. Segundo, existen tokens con probabilidades maximas mas grandes que otras, es decir, tokens asociados a un mismo emoji (segun el criterio anterior) que poseen probabilidades distintas de pertenecer a dicha clase. Para observar esto, se visualizan los token con puntos de diferente tamaƱo y proporcional a tal probabilidad.

In [24]:
df_umap = pd.DataFrame(to_R2)
df_umap["token"] = vectorizer.get_feature_names_out()
df_umap["label"] = np.argmax(proba_matrix, axis=1).astype(str)
df_umap["proba"] = np.max(proba_matrix, axis=1)
df_umap = df_umap.merge(df_us_mapping, on="label", how="left")
df_umap
Out[24]:
0 1 token label proba emoji name
0 9.157049 11.966369 00 0 0.160577 ā¤ _red_heart_
1 5.644411 9.527576 000 0 0.144546 ā¤ _red_heart_
2 10.488245 6.262007 001 2 0.119453 šŸ˜‚ _face_with_tears_of_joy_
3 9.791545 8.185851 004 10 0.116191 šŸ“· _camera_
4 10.235905 6.789739 005 2 0.114231 šŸ˜‚ _face_with_tears_of_joy_
... ... ... ... ... ... ... ...
29978 8.535691 8.772460 ĻƒĻ‡ 7 0.116272 ✨ _sparkles_
29979 11.129086 12.883891 ć‚¢ćƒ”ćƒŖć‚« 0 0.187796 ā¤ _red_heart_
29980 10.712729 15.200594 留学 0 0.288069 ā¤ _red_heart_
29981 11.727093 10.708837 ė‰“ģš• 0 0.131694 ā¤ _red_heart_
29982 10.266031 8.860102 토딠토 19 0.116800 😜 _winking_face_with_tongue_

29983 rows Ɨ 7 columns

In [25]:
data = []
for label in df_us_mapping["label"]:
    sub_df = df_umap[df_umap["label"] == label]
    data.append(
        go.Scattergl(
            x = sub_df[0],
            y = sub_df[1],
            mode='markers',
            text=sub_df["token"]+"<br>"+sub_df["emoji"]+"<br>"+sub_df["proba"].apply(lambda x: str(np.round(x, 3))),
            name=sub_df["emoji"].iloc[0],
            marker=dict(
                size=25*sub_df["proba"],
                line_width=0.2,
            )
        )
    )
    
fig = go.Figure(data=data)
fig.update_layout(
    title="Proyección (UMAP) de vectores de probabilidad de tokens",
    autosize=False,
    width=700,
    height=500,
)
fig.show(renderer="notebook")

Comentarios El top 5 de la seccion anterior se puede capturar con los primero cinco punto de mayor tamaƱo para un emoji dado. Tambien, se observa que la clase con mas puntos corresponde al emoji del corazon. Mismo emoji con mayor popularidad visto en la etapa de analisis de los datos. Se observan grupos diferenciados, pero que logran solaparse. Esta zona coincide con aquellos tokens con probabilidades uniformes de pertenecer a cada clase y/o con probabilidad maxima cercanas a 0.1.